{
    "trainer": {
        "evaluation_strategy": "steps",
        "per_device_train_batch_size": 7,
        "per_device_eval_batch_size": 7,
        "gradient_accumulation_steps": 5,
        "eval_steps": 150,
        "save_steps": 150,
        "logging_steps": 10,
        "learning_rate": 0.00005,
        "num_train_epochs": 3,
        "lr_scheduler_type": "cosine",
        "warmup_steps": 100,
        "fp16": false,
        "bf16": true,
        "gradient_checkpointing": false,
        "torch_compile": false,
        "optim": "adamw_torch",
        "half_precision_backend": "auto",
        "fp16_opt_level": "O2"
    },
    "deepspeed": {
        "bf16": {
            "enabled": true
        },
        "optimizer": {
            "type": "AdamW",
            "params": {
                "lr": "auto",
                "betas": "auto",
                "eps": "auto",
                "weight_decay": "auto"
            }
        },
        "zero_optimization": {
            "stage": 2,
            "offload_optimizer": {
                "device": "cpu",
                "pin_memory": true
            },
            "overlap_comm": true,
            "round_robin_gradients": true
        },
        "train_batch_size": "auto",
        "gradient_accumulation_steps": "auto"
    },
    "model_name": "bigscience/mt0-large",
    "model_type": "seq2seq",
    "max_source_tokens_count": 256,
    "max_target_tokens_count": 256
}

